## Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(ggplot2)
library(leaflet)
library(leaflet.extras)
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(RColorBrewer)
library(scales)
library(ggspatial)
library(knitr)
library(DiagrammeR)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(foreach)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(mapview)
library(webshot)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(tidyr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(grDevices)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
knitr::opts_chunk$set(webshot = TRUE)
source("src/hdbscan_functions.R")
source("src/xgboost_functions.R")
# Loading the aggregated file
raw_viirs_data = readRDS("datasets/complete_datasets/raw_viirs_data_2023_to_2024.rds")
print(paste("Total number of observed combustion events globally:", nrow(raw_viirs_data)))
## [1] "Total number of observed combustion events globally: 24412309"
# Filtering for points near and within the Permian Basin for faster computational time
# NOTE(review): this bounding box (lat 15-72, lon -168 to -52) covers most of North
# America, not just the Permian Basin; the precise shapefile intersection happens later.
raw_viirs_data = raw_viirs_data %>% filter(between(lat, 15, 72), between(lon, -168, -52))
# Part A: Converting 999,999 to NA
# 999999 appears to be the sentinel for "missing" in the numeric VIIRS columns -- confirm.
raw_viirs_data = raw_viirs_data %>% mutate(across(where(is.numeric), ~ ifelse(. == 999999, NA, .)))
# Part B: Convert to julian_date
# Day-of-year (1-366); the calendar `date` column is dropped afterwards.
raw_viirs_data = raw_viirs_data %>% mutate(julian_date = as.numeric(format(as.Date(date), "%j"))) %>% select(-date)
# Part C: Projecting dataset's 'lon' and 'lat' columns into UTM13 (EPSG:32613) in km
# The /1000 converts the projected meters to kilometers.
utm_coords = st_coordinates(st_transform(st_as_sf(raw_viirs_data, coords = c("lon", "lat"), crs = 4326), crs = 32613)) / 1000
raw_viirs_data = raw_viirs_data %>% mutate(lon_km_utm13 = utm_coords[,1], lat_km_utm13 = utm_coords[,2]) %>% select(-lon, -lat)
# Part A: Loading the shapefile
shape_data = st_read("shape_file/pb_shape.shp")
## Reading layer `pb_shape' from data source
## `/Users/ethanzaj/Desktop/Research/Permian Basin, MF/R File/permian_basin_final/shape_file/pb_shape.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 7 features and 12 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -104.8068 ymin: 30.63042 xmax: -100.671 ymax: 33.70039
## Geodetic CRS: WGS 84
# Part B: Transform shapefile to UTM13 and convert to km
shape_data = shape_data %>% st_transform(crs = 32613)
# Dividing the geometry by 1000 rescales meters to km; the CRS metadata no
# longer matches after this, hence the CRS is dropped below.
shape_data = st_set_geometry(shape_data, st_geometry(shape_data) / 1000)
# Part C: Convert dataset into spatial object using UTM13 coordinates in km
# The points get no CRS assigned, so both layers live in the same km-scaled
# UTM13 space and the intersection is done CRS-less on purpose.
points_sf = st_as_sf(raw_viirs_data, coords = c("lon_km_utm13", "lat_km_utm13"), remove = FALSE)
st_crs(shape_data) = NA
# Part D: Final intersection
# Dense logical matrix: rows = points, columns = the 7 basin polygons.
intersection_indices = st_intersects(points_sf, shape_data, sparse = FALSE)
# Keep points that fall inside ANY polygon; drop the `id` column.
points_within_basin = points_sf[apply(intersection_indices, 1, any), ] %>% select(-id)
print(paste("Total number of potential Permian Basin combustion events:", nrow(points_within_basin)))
## [1] "Total number of potential Permian Basin combustion events: 139907"
# Garbage Removal
rm(utm_coords)
Filter for temperature > 1600 K:
# Keep only hot detections. Note the filter is a strict `>` even though the
# flowchart labels later in this file say "≥ 1600 K".
points_final = points_within_basin %>% filter(temp_bb > 1600)
print(paste("Total number of observed Permian Basin combustion events with Temperature > 1600K:", nrow(points_final)))
## [1] "Total number of observed Permian Basin combustion events with Temperature > 1600K: 80007"
# Sweep HDBSCAN's minPts parameter and tabulate cluster/noise statistics.
# hdbscan_analysis() is defined in src/hdbscan_functions.R; the semantics of
# mode = 1 are defined there -- confirm.
source("src/hdbscan_functions.R")
hdbscan_analysis_df = hdbscan_analysis(mode = 1)
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "format"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
##
##
## Table: Summary Statistics for HDBSCAN Clustering
##
## | minPts| clustered_obs| noise_obs| noise_percentage| clusters| ratio_flares_per_cluster|
## |------:|-------------:|---------:|----------------:|--------:|------------------------:|
## | 2| 67993| 12014| 15.016186| 23042| 2.950829|
## | 3| 64402| 15605| 19.504543| 7324| 8.793282|
## | 4| 72056| 7951| 9.937880| 2462| 29.267262|
## | 5| 73387| 6620| 8.274276| 1706| 43.016999|
## | 6| 74079| 5928| 7.409352| 1417| 52.278758|
## | 7| 73478| 6529| 8.160536| 1309| 56.132926|
## | 8| 73010| 6997| 8.745485| 1224| 59.648693|
Merging the HDBSCAN data with the viirs dataset:
source("src/hdbscan_functions.R")
# minPts = 6 -- presumably chosen from the sweep above (lowest noise count
# among the stable settings) -- confirm the selection rationale.
hdbscan_dataset = hdbscan_dataset_merge(mode = 1, minPts = 6)
# Part A: Filter by Cluster == 0 Noise Points & Filter By Non-Na Methane_EQ Noise Points
# HDBSCAN labels noise observations with cluster id 0.
hdbscan_dataset_noise = hdbscan_dataset %>% filter(cluster == 0)
noise_with_methane = hdbscan_dataset_noise %>% filter(!is.na(methane_eq))
# Part B: Proportion Calculations
total_noise_points = nrow(hdbscan_dataset_noise)
non_na_methane_noise = nrow(noise_with_methane)
na_methane_noise = total_noise_points - non_na_methane_noise
cat("Noise Points (Cluster == 0):\n")
## Noise Points (Cluster == 0):
cat("Total points:", total_noise_points, "\n")
## Total points: 5928
cat("Non-NA methane_eq:", non_na_methane_noise, "(", round(non_na_methane_noise/total_noise_points*100, 1), "%)\n")
## Non-NA methane_eq: 1122 ( 18.9 %)
cat("NA methane_eq:", na_methane_noise, "(", round(na_methane_noise/total_noise_points*100, 1), "%)\n")
## NA methane_eq: 4806 ( 81.1 %)
# Part A: Creating Noise Distance Dataset
# Computes, for each noise point with methane data, the distance to its nearest
# HDBSCAN cluster. NOTE(review): `hdbscan_dataset_filtered` is not defined
# earlier in this file; presumably created inside src/hdbscan_functions.R -- confirm.
hdbscan_non_na_methane_noise_dataset = hdbscan_noise_distance_calculation(mode = 1,
noise_with_methane = noise_with_methane,
hdbscan_dataset_filtered = hdbscan_dataset_filtered)
# Part B: Creating Frequency Distribution
mean_distance = mean(hdbscan_non_na_methane_noise_dataset$nearest_cluster_distance)
median_distance = median(hdbscan_non_na_methane_noise_dataset$nearest_cluster_distance)
# Reference lines are drawn with annotate("segment", ...) rather than
# geom_segment(aes(...)): constant aesthetics inside aes() are recycled over
# every data row, which triggered the "All aesthetics have length 1, but the
# data has 1122 rows" warnings. `linewidth` replaces the `size` aesthetic
# deprecated for lines in ggplot2 3.4.0.
distance_plot = ggplot(hdbscan_non_na_methane_noise_dataset, aes(x = nearest_cluster_distance)) +
  geom_histogram(bins = 100, fill = "black", color = "black", alpha = 0.7) +
  annotate("segment", x = mean_distance, xend = mean_distance, y = 0, yend = 80, color = "red", linetype = "dashed", linewidth = 1) +
  annotate("segment", x = median_distance, xend = median_distance, y = 0, yend = 100, color = "red", linetype = "dashed", linewidth = 1) +
  annotate("text", x = mean_distance + 0.5, y = 80, label = paste("Mean =", round(mean_distance, 2), "km"), color = "red", vjust = 0, hjust = 0) +
  annotate("text", x = median_distance + 0.55, y = 100, label = paste("Median =", round(median_distance, 2), "km"), color = "red", vjust = 0, hjust = 0) +
  coord_cartesian(xlim = c(0, 5)) +
  labs(title = "Distance from Noise Points (with Methane) to Nearest Cluster", x = "Distance to Nearest Cluster (km)", y = "Frequency") +
  theme_minimal()
interactive_plot = ggplotly(distance_plot, tooltip = c("x", "y"))
interactive_plot
Breakdown of the above noise:
# Flowchart of how the 5,928 HDBSCAN noise points split by methane availability
# (Part B proportions above) and, for those with methane data, by distance to
# the nearest cluster (750 m threshold applied in the reassignment step below).
DiagrammeR::grViz("
digraph flowchart {
graph [layout = dot, rankdir = TB]
# Node styles
node [shape = rectangle, style = filled, fontname = Arial]
# Level 1 - Initial classification
A [label = '5,928 observations\\n(classified as noise by HDBSCAN)']
# Level 2 - Methane data split
B1 [label = '1,122 observations (18.9%)\\n(with Non-NA Methane Eq.)']
B2 [label = '4,806 observations (81.1%)\\n(with NA Methane Eq.)']
# Level 3 - Distance analysis (only for B1 - points with methane data)
C1 [label = '585 observations (52.2% of 1,122)\\n(within 750m of a nearest cluster)']
C2 [label = '537 observations (47.8% of 1,122)\\n(beyond 750m of a nearest cluster)']
# Edges (diagonal is fine for these)
A -> B1
A -> B2
B1 -> C1
B1 -> C2
# Add some spacing between levels
{rank = same; B1; B2}
{rank = same; C1; C2}
}
")
# Part A: Reassign points within 750m of the nearest edge to their nearest cluster
points_to_reassign = hdbscan_non_na_methane_noise_dataset %>% filter(nearest_cluster_distance <= 0.75 & !is.na(methane_eq))
# Part B: Index-match each non-NA methane_eq noise point by its UTM coordinates,
# then replace its cluster id with that of its nearest cluster.
# seq_len() iterates zero times when there are no rows, so the previous
# `if (nrow(...) > 0)` wrapper (guarding against 1:0 == c(1, 0)) is unnecessary.
# NOTE(review): matching on exact equality of floating-point coordinates works
# here only because both values come from the same unmodified columns -- confirm.
for (i in seq_len(nrow(points_to_reassign))) {
  row_idx = which(hdbscan_dataset$cluster == 0 &
                  hdbscan_dataset$lon_km_utm13 == points_to_reassign$lon_km_utm13[i] &
                  hdbscan_dataset$lat_km_utm13 == points_to_reassign$lat_km_utm13[i])
  if (length(row_idx) > 0) {
    hdbscan_dataset$cluster[row_idx] = points_to_reassign$nearest_cluster[i]
  }
}
# Part C: Removing remaining noise points
# Everything still labeled cluster 0 after the 750 m reassignment is dropped.
hdbscan_dataset_final = hdbscan_dataset %>% filter(cluster != 0)
# Part D: Summary Statistics
before_count = nrow(hdbscan_dataset)
after_count = nrow(hdbscan_dataset_final)
removed_count = before_count - after_count
# NOTE(review): this counts rows *selected* for reassignment, not rows actually
# matched and relabeled in the loop above -- confirm the two quantities agree.
reassigned_count = nrow(points_to_reassign)
cat(sprintf("Noise points with non-NA methane_eq within 0.75km of nearest cluster: %d\n", reassigned_count))
## Noise points with non-NA methane_eq within 0.75km of nearest cluster: 585
cat(sprintf("Total number of potential Permian Basin combustion events that are non-noise observations: %d\n", after_count))
## Total number of potential Permian Basin combustion events that are non-noise observations: 74664
cat(sprintf("Points removed: %d (%.1f%%)\n", removed_count, (removed_count/before_count)*100))
## Points removed: 5343 (6.7%)
# (Optional) Save the final HDBSCAN dataset to CSV.
# Fixed: the previous sprintf() format string contained no %d placeholder, so
# the minPts argument was silently unused and R warned
# "one argument not used by format". The output path is unchanged; minPts is
# kept in the signature for backward compatibility with existing callers.
save_hdbscan_final_dataset = function(dataset, minPts) {
  csv_filename = "datasets/complete_datasets/final_hdbscan_dataset.csv"
  write.csv(dataset, csv_filename, row.names = FALSE)
}
save_hdbscan_final_dataset(hdbscan_dataset_final, 6)
Map of the HDBSCAN clusters identified
# Part A: Find cluster centers & project to WGS84
cluster_centroids = hdbscan_dataset_final %>%
group_by(cluster) %>%
summarise(
# km -> m so the coordinates can legitimately carry EPSG:32613 (meter-based)
centroid_lon_m = mean(lon_km_utm13, na.rm = TRUE) * 1000,
centroid_lat_m = mean(lat_km_utm13, na.rm = TRUE) * 1000,
cluster_size = n(),
.groups = 'drop'
)
centroids_sf = st_as_sf(cluster_centroids, coords = c("centroid_lon_m", "centroid_lat_m"), crs = 32613) %>% st_transform(crs = 4326)
# Part B: Create unified shape boundary
# Undo the earlier /1000 rescale, restore the CRS, dissolve the 7 polygons
# into one boundary, and reproject to WGS84 for leaflet.
shape_data_combined = shape_data %>%
st_set_geometry(st_geometry(.) * 1000) %>%
st_set_crs(32613) %>%
st_union() %>%
st_sf() %>%
st_transform(crs = 4326)
# Attach WGS84 lon/lat columns to the centroid table for plotting.
cluster_centroids_geo = cluster_centroids %>% mutate(lon_geo = st_coordinates(centroids_sf)[,1], lat_geo = st_coordinates(centroids_sf)[,2])
# Part C: HDBSCAN Map with scale bar
# Interactive leaflet map: basin boundary polygon + one red dot per cluster
# centroid, with a popup showing cluster id, size, and WGS84 coordinates.
hdbscan_map = leaflet() %>%
addTiles() %>%
addPolygons(data = shape_data_combined,
fillColor = "lightblue",
fillOpacity = 0.3,
color = "darkblue",
weight = 2,
popup = "Permian Basin") %>%
addCircleMarkers(data = cluster_centroids_geo,
lng = ~lon_geo,
lat = ~lat_geo,
radius = 3,
color = "red",
fillColor = "red",
fillOpacity = 0.7,
popup = ~paste("Cluster:", cluster, "<br>",
"Size:", cluster_size, "observations<br>",
"Coordinates:", round(lon_geo, 4), ",", round(lat_geo, 4))) %>%
addScaleBar(position = "bottomright",
options = scaleBarOptions(
maxWidth = 100,
metric = TRUE,
imperial = TRUE,
updateWhenIdle = TRUE
))
hdbscan_map
# Part A: Convert data to UTM13 coordinates for use
# km -> m, tag with EPSG:32613, then reproject to WGS84 for leaflet.
heatmap_points = hdbscan_dataset_final %>%
st_as_sf(coords = c("lon_km_utm13", "lat_km_utm13"), crs = 32613) %>%
st_set_geometry(st_geometry(.) * 1000) %>%
st_set_crs(32613) %>%
st_transform(crs = 4326)
heatmap_coords = st_coordinates(heatmap_points)
heatmap_data = data.frame(lat = heatmap_coords[,2], lng = heatmap_coords[,1])
# Part B: HDBSCAN Heatmap
# NOTE(review): the legend bins ("Flares/km²") are hand-written labels; they
# are not derived from the heatmap's actual intensity scale -- confirm.
heatmap_leaflet = leaflet() %>%
addTiles() %>%
addPolygons(data = shape_data_combined,
fillColor = "lightblue",
color = "darkblue",
weight = 2,
fillOpacity = 0.3,
popup = "Permian Basin") %>%
addHeatmap(data = heatmap_data,
lng = ~lng,
lat = ~lat,
blur = 10, # Increased blur for smoother transitions
max = 0.6, # Reduced max for better blending
radius = 10, # Increased radius for more overlap
minOpacity = 0.1, # Add minimum opacity for continuous feel
gradient = c("purple", "blue", "cyan", "yellow", "orange", "red")) %>%
addLegend(
position = "bottomright",
colors = c("purple", "blue", "cyan", "yellow", "orange", "red"),
labels = c("0-10", "10-25", "25-50", "50-100", "100-150", "150+"),
title = "Flares/km²",
opacity = 0.8
) %>%
addScaleBar(position = "bottomright",
options = scaleBarOptions(
maxWidth = 100,
metric = TRUE,
imperial = TRUE,
updateWhenIdle = TRUE
))
heatmap_leaflet
# Part A: Filter By Missing and Non Missing
# Split the clustered dataset by availability of the methane_eq target.
hdbscan_dataset_non_na = hdbscan_dataset_final %>% filter(!is.na(methane_eq))
hdbscan_dataset_na = hdbscan_dataset_final %>% filter(is.na(methane_eq))
# Part B: Proportion Calculations
total_obs = nrow(hdbscan_dataset_final)
methane_non_na = nrow(hdbscan_dataset_non_na)
methane_na = nrow(hdbscan_dataset_na)
methane_non_na_pct = round(100 * methane_non_na / total_obs, 2)
methane_na_pct = round(100 * methane_na / total_obs, 2)
# Part C: Summary
print(paste("Total number of potential Permian Basin combustion events (valid):", total_obs))
## [1] "Total number of potential Permian Basin combustion events (valid): 74664"
print(paste("Non-NA methane_eq observations:", methane_non_na, "(", methane_non_na_pct, "%)"))
## [1] "Non-NA methane_eq observations: 22087 ( 29.58 %)"
print(paste("NA methane_eq observations:", methane_na, "(", methane_na_pct, "%)"))
## [1] "NA methane_eq observations: 52577 ( 70.42 %)"
Breakdown of above:
# End-to-end data-reduction flowchart (counts match the printed summaries
# earlier in this file). NOTE(review): nodes C and the B->C edge say
# "Temperature ≥ 1600 K", but the actual filter applied is a strict `> 1600`.
DiagrammeR::grViz("
digraph flowchart {
graph [layout = dot, rankdir = TB]
# Nodes
node [shape = rectangle, style = filled, fillcolor = lightgrey, fontname = Arial]
A [label = '24,412,309 observations\n(Total Observations from VIIRS Satellite)']
B [label = '139,907 observations\n(Intersected Observations Within Permian Basin)']
C [label = '80,007 observations\n(Temperature ≥ 1600 K)']
D [label = '74,664 observations\n(Clustered by HDBSCAN)']
E1 [label = '22,087 observations (29.6%)\n(Non-Missing Methane)']
E2 [label = '52,577 observations (70.4%)\n(Missing Methane)']
# Edges
A -> B [label = ' Spatial Intersection']
B -> C [label = ' Temperature ≥ 1600 K']
C -> D [label = ' 5,343 Points Removed']
D -> E1
D -> E2 [label = '']
}
")
Using a Mann-Whitney U test to determine whether there are significant differences between the sample with missing methane_eq and the sample with non-missing methane_eq:
features_to_test = c("temp_bb", "rh", "area_bb", "cloud_mask", "lon_km_utm13", "lat_km_utm13", "julian_date")
# Mann-Whitney U (Wilcoxon rank-sum) comparison of one feature between the
# missing-methane and non-missing-methane groups.
# Returns a one-row data.frame with sample sizes, medians, the W statistic,
# the two-sided p-value, and a normal-approximation effect size r = |Z|/sqrt(N).
perform_mann_whitney = function(feature_name, missing_data, non_missing_data) {
  # Pull the two samples and drop NA entries from each.
  grp_missing = missing_data[[feature_name]]
  grp_present = non_missing_data[[feature_name]]
  grp_missing = grp_missing[!is.na(grp_missing)]
  grp_present = grp_present[!is.na(grp_present)]
  # Guard clause: with fewer than 5 usable values in either group the test is
  # not meaningful, so return a placeholder row instead.
  if (length(grp_missing) < 5 || length(grp_present) < 5) {
    return(data.frame(
      feature = feature_name,
      n_missing = length(grp_missing),
      n_non_missing = length(grp_present),
      median_missing = NA,
      median_non_missing = NA,
      median_difference = NA,
      mann_whitney_statistic = NA,
      p_value = NA,
      effect_size_r = NA,
      effect_size_interpretation = "Insufficient data"
    ))
  }
  # Two-sided rank-sum test.
  rank_test = wilcox.test(grp_missing, grp_present, alternative = "two.sided")
  med_missing = median(grp_missing)
  med_present = median(grp_present)
  n1 = length(grp_missing)
  n2 = length(grp_present)
  # Normal approximation of W: Z = (W - E[W]) / sd(W), then r = |Z| / sqrt(N).
  w_mean = n1 * n2 / 2
  w_var = n1 * n2 * (n1 + n2 + 1) / 12
  z_value = (rank_test$statistic - w_mean) / sqrt(w_var)
  effect_size_r = abs(z_value) / sqrt(n1 + n2)
  # Map r onto the conventional bands via left-closed cut points 0.1/0.3/0.5,
  # which reproduces the original strict `<` comparisons exactly.
  band_labels = c("Negligible", "Small", "Medium", "Large")
  effect_interpretation = band_labels[findInterval(effect_size_r, c(0.1, 0.3, 0.5)) + 1]
  data.frame(
    feature = feature_name,
    n_missing = n1,
    n_non_missing = n2,
    median_missing = med_missing,
    median_non_missing = med_present,
    median_difference = med_missing - med_present,
    mann_whitney_statistic = as.numeric(rank_test$statistic),
    p_value = rank_test$p.value,
    effect_size_r = effect_size_r,
    effect_size_interpretation = effect_interpretation
  )
}
# Perform Tests On All Features
# One result row per feature, accumulated in a named list and row-bound below.
results_list = list()
for (feature in features_to_test) {
results_list[[feature]] = perform_mann_whitney(feature, hdbscan_dataset_na, hdbscan_dataset_non_na)
}
# Combine Results
test_results = do.call(rbind, results_list)
# Represent Results
# Round everything for display; very small p-values switch to scientific notation.
clean_table = test_results %>%
select(feature, median_missing, median_non_missing, median_difference, mann_whitney_statistic, p_value, effect_size_r) %>%
mutate(
median_missing = round(median_missing, 2),
median_non_missing = round(median_non_missing, 2),
median_difference = round(median_difference, 2),
mann_whitney_statistic = round(mann_whitney_statistic, 2),
p_value = ifelse(p_value < 1e-10, sprintf("%.2e", p_value), round(p_value, 6)),
effect_size_r = round(effect_size_r, 3)
)
# Print Table
kable(clean_table,
col.names = c("feature", "median_missing", "median_non_missing", "median_difference", "mann_whitney_statistic", "p_value", "effect_size_r"),
caption = "Mann-Whitney U Test: Median Comparison Between Missing and Non-Missing Methane Groups")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "format"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
| feature | median_missing | median_non_missing | median_difference | mann_whitney_statistic | p_value | effect_size_r | |
|---|---|---|---|---|---|---|---|
| temp_bb | temp_bb | 1897.00 | 1872.00 | 25.00 | 639737090 | 3.79e-107 | 0.080 |
| rh | rh | 0.80 | 0.72 | 0.08 | 606764756 | 2.45e-22 | 0.036 |
| area_bb | area_bb | 1.05 | 1.00 | 0.04 | 583672822 | 0.258281 | 0.004 |
| cloud_mask | cloud_mask | 0.00 | 0.00 | 0.00 | 482849756 | 0.688002 | 0.001 |
| lon_km_utm13 | lon_km_utm13 | 786.38 | 680.35 | 106.02 | 698464783 | 0.00e+00 | 0.160 |
| lat_km_utm13 | lat_km_utm13 | 3570.14 | 3543.39 | 26.75 | 686778140 | 0.00e+00 | 0.145 |
| julian_date | julian_date | 179.00 | 176.00 | 3.00 | 586635188 | 0.025579 | 0.008 |
# Train on observations where the methane_eq target is observed.
dataset_filtered = hdbscan_dataset_final %>% filter(!is.na(methane_eq))
features = c("temp_bb", "rh", "area_bb", "cloud_mask", "lon_km_utm13", "lat_km_utm13", "julian_date")
# evaluate_or_load() is defined in src/xgboost_functions.R; here it loads a
# cached hyperparameter grid-search result rather than re-running it.
all_results = evaluate_or_load(mode = 1, dataset_filtered, features, save_models = TRUE)
## Loaded 97200 results from: datasets/xgboost_datasets/xgboost_hyperparameter_results.csv
Model Parameters:
# Seed for reproducibility of the train/test split below.
set.seed(1234)
# Part A: Dataset Modifications (Adding Column For Missingness Status Of Target Methane_EQ)
# 1 = methane_eq missing, 0 = observed.
raw_dataset = hdbscan_dataset_final %>%
mutate(methane_eq_missingness_indicator = as.integer(is.na(methane_eq))) %>%
relocate(methane_eq_missingness_indicator, .after = methane_eq)
# Part B: Create Model
# Hyperparameters presumably correspond to the best row of the grid search
# loaded above -- confirm against all_results.
results = create_xgb_model_and_predictions(
all_results = all_results,
dataset_split_value = 0.75,
eta_value = 0.3,
max_depth_value = 7.0,
subsample_value = 1.0,
colsample_bytree_value = 1.0,
min_child_weight_value = 1.0,
gamma_value = 0.0,
lambda_value = 0.1,
alpha_value = 0.0,
observed_only_dataset = hdbscan_dataset_non_na,
raw_dataset = raw_dataset,
features = features,
nrounds = 100
)
model = results$model
model_dataset = results$model_dataset
filtered_row = results$filtered_row
Histogram of actual methane_eq data (target):
# Plot helpers below are defined in src/xgboost_functions.R -- confirm.
plot_observed_variable(hdbscan_dataset_non_na, raw_dataset)
## Warning: Removed 52577 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Removed 52577 rows containing non-finite outside the scale range
## (`stat_bin()`).
Histogram of predicted methane_eq data (by model):
plot_observed_histogram(model_dataset)
Histogram of predicted methane_eq by model across all points in the dataset (both missing and non-missing):
plot_model_histogram(model_dataset, title = "Histogram Of Model (Observed + Missing), Highest R2 Model")
Observed vs Predicted plot. The red line is the true y = x line; the blue line is the bias:
# Scatter of observed vs model-predicted methane_eq. The aes_string()
# deprecation warning below originates inside the helper function, not here.
train_r2_observed_vs_predicted_plot = plot_observed_vs_predicted(data = model_dataset, title = "Observed vs Predicted, Highest R2 Model")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
train_r2_observed_vs_predicted_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 52577 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 52577 rows containing missing values or values outside the scale range
## (`geom_point()`).
Bias Analysis
# Tabulates mean bias, MAE, RMSE, and percent bias for the fitted model.
calculate_and_display_bias_metrics(data = model_dataset, caption = "Bias Summary Metrics, Highest R2 Model")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
| Mean Bias Estimate | MAE | RMSE | Percent Bias |
|---|---|---|---|
| -0.000191 | 0.006017 | 0.01316 | -0.172262 |
Training vs Testing Evaluation
# NOTE(review): Train R² = 0.9999 vs Test R² = 0.9191 in the recorded output
# suggests some overfitting -- worth discussing in the write-up.
display_train_test_metrics(filtered_row = filtered_row, caption = "Training vs Testing Metrics, Highest R2 Model")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
| Train R² | Test R² | Train RMSE | Test RMSE | Train MAE | Test MAE |
|---|---|---|---|---|---|
| 0.9999 | 0.9191 | 0.0017 | 0.0608 | 0.0012 | 0.016 |
(Below is not included in the paper, but is still interesting to note)
Feature Importance
plot_feature_importance(model = model, feature_names = features, top_n = 20)
Diagnostic plots (except for Cook’s Distance, as this is not a linear model):
create_diagnostic_plots(model_dataset)
Histogram of all the imputed methane_eq values but differentiated by color. Green is the observed values, blue are the predicted values from the model:
# Returns both the plot and the observed/imputed density-peak locations.
result_train_r2 = plot_imputed_histogram(model_dataset)
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Peak Analysis:
## Observed Peak: 0.041
## Imputed Peak: 0.045
## Peak Difference (Imputed - Observed): 0.004
result_train_r2$plot
observed_peak = result_train_r2$peak_observed
imputed_peak = result_train_r2$peak_imputed
peak_diff = result_train_r2$peak_difference
# Summary table comparing observed vs imputed distributions (means, variances,
# SDs, density peaks, and row counts).
results_train_r2 = extract_and_display_clustering_metrics(
model = model,
data = model_dataset,
name = "Train R2",
peak_obs_lbl = observed_peak,
peak_imp_lbl = imputed_peak,
peak_diff_lbl = peak_diff,
caption = "Statistics, Highest R2 Model"
)
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
results_train_r2$table
| Model | Observed Mean | Imputed Mean | Observed Variance | Imputed Variance | Observed SD | Imputed SD | Observed Peak | Imputed Peak | Difference In Peaks | Observed # Of Rows | Total # Of Rows |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Train R2 | 0.111 | 0.106 | 0.028 | 0.017 | 0.168 | 0.13 | 0.041 | 0.045 | 0.004 | 22087 | 74664 |
From Jan 1, 2013 to Dec 31, 2024
# Part 1: Loading Dataset and Date modifications
# Decade-plus time series of VIIRS detections (Jan 1, 2013 - Dec 31, 2024).
decade_dataset = readRDS("datasets/time_series_datasets/raw_viirs_data_2013_to_2024.rds")
decade_dataset$date = as.Date(decade_dataset$date)
decade_dataset$year = as.numeric(format(decade_dataset$date, "%Y"))
# Part 2: Count flares by year
flare_counts = decade_dataset %>%
group_by(year) %>%
summarise(flare_count = n(), .groups = 'drop') %>%
arrange(year)
# Part 3: Summary Statistics
print(flare_counts)
## # A tibble: 12 × 2
## year flare_count
## <dbl> <int>
## 1 2013 10667
## 2 2014 14124
## 3 2015 22296
## 4 2016 22044
## 5 2017 22176
## 6 2018 61536
## 7 2019 80731
## 8 2020 52869
## 9 2021 37636
## 10 2022 76272
## 11 2023 48713
## 12 2024 41681
total_flares = sum(flare_counts$flare_count)
average_flares = round(mean(flare_counts$flare_count), 0)
median_flares = median(flare_counts$flare_count)
max_year = flare_counts$year[which.max(flare_counts$flare_count)]
max_flares = max(flare_counts$flare_count)
min_year = flare_counts$year[which.min(flare_counts$flare_count)]
min_flares = min(flare_counts$flare_count)
# Fixed label: the dataset spans 2013-2024 (12 years), not 2013-2023.
cat("Total flares (2013-2024):", format(total_flares, big.mark = ","), "\n")
## Total flares (2013-2024): 490,745
cat("Average flares per year:", format(average_flares, big.mark = ","), "\n")
## Average flares per year: 40,895
cat("Median flares per year:", format(median_flares, big.mark = ","), "\n")
## Median flares per year: 39,658.5
cat("Maximum:", max_year, "with", format(max_flares, big.mark = ","), "flares\n")
## Maximum: 2019 with 80,731 flares
cat("Minimum:", min_year, "with", format(min_flares, big.mark = ","), "flares\n")
## Minimum: 2013 with 10,667 flares
# Part 4: Bar Graph
# Annual flare counts as a bar chart with comma-formatted count labels
# floated 2,000 units above each bar.
flare_bar_plot = ggplot(flare_counts, aes(x = factor(year), y = flare_count)) +
  geom_bar(stat = "identity", fill = "black", color = "black", alpha = 0.7, width = 0.8) +
  geom_text(aes(x = factor(year), y = flare_count + 2000, label = format(flare_count, big.mark = ",")),
            size = 4, color = "black", hjust = 0.5) +
  labs(
    title = "Number of Flares by Year",
    x = "Year",
    y = "Frequency"
  ) +
  scale_y_continuous(labels = scales::comma_format()) +
  coord_cartesian(ylim = c(0, max(flare_counts$flare_count) * 1.1)) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    # Fixed: hjust must lie in [0, 1]; the previous hjust = 3 pushed the angled
    # axis labels far away from their ticks. hjust = 1 matches the YoY plot below.
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )
interactive_flare_plot = ggplotly(flare_bar_plot, tooltip = c("x", "y"))
interactive_flare_plot
YoY percent change:
# Part 1: Calculate Year-over-Year Percentage Change
# lag() here is dplyr::lag (previous row after arrange(year)).
flare_change = flare_counts %>%
arrange(year) %>%
mutate(
previous_year_count = lag(flare_count),
yoy_change = ((flare_count - previous_year_count) / previous_year_count) * 100,
yoy_change_rounded = round(yoy_change, 1)
) %>%
filter(!is.na(yoy_change)) # Remove first year (no previous year to compare)
# Part 2: Summary Statistics for YoY Changes
print("Year-over-Year Changes:")
## [1] "Year-over-Year Changes:"
average_change = round(mean(flare_change$yoy_change), 1)
median_change = round(median(flare_change$yoy_change), 1)
max_increase_year = flare_change$year[which.max(flare_change$yoy_change)]
max_increase = round(max(flare_change$yoy_change), 1)
max_decrease_year = flare_change$year[which.min(flare_change$yoy_change)]
max_decrease = round(min(flare_change$yoy_change), 1)
cat("\n=== Year-over-Year Change Summary ===\n")
##
## === Year-over-Year Change Summary ===
cat("Average YoY change:", average_change, "%\n")
## Average YoY change: 26.1 %
cat("Median YoY change:", median_change, "%\n")
## Median YoY change: 0.6 %
cat("Largest increase:", max_increase_year, "with", max_increase, "% increase\n")
## Largest increase: 2018 with 177.5 % increase
cat("Largest decrease:", max_decrease_year, "with", max_decrease, "% decrease\n")
## Largest decrease: 2023 with -36.1 % decrease
# Part 3: Bar Graph for YoY Percentage Change
# Green bars for increases, red for decreases; labels offset above/below bars.
yoy_change_plot = ggplot(flare_change, aes(x = factor(year), y = yoy_change)) +
geom_bar(stat = "identity",
aes(fill = yoy_change > 0),
color = "black",
alpha = 0.7,
width = 0.8) +
scale_fill_manual(values = c("TRUE" = "darkgreen", "FALSE" = "darkred"), guide = "none") +
geom_text(aes(x = factor(year),
y = ifelse(yoy_change > 0, yoy_change + 5, yoy_change - 7),
label = paste0(yoy_change_rounded, "%")),
size = 4,
color = "black",
hjust = 0.5) +
geom_hline(yintercept = 0, color = "black", linewidth = 0.8) +
labs(
title = "Year-over-Year Percentage Change in Number of Flares",
x = "Year",
y = "Percentage Change (%)"
) +
scale_y_continuous(labels = function(x) paste0(x, "%")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none"
)
# Part 4: Interactive Plot
interactive_yoy_plot = ggplotly(yoy_change_plot, tooltip = c("x", "y"))
interactive_yoy_plot
Summary statistics of all the clusters
# Part A: Calculate cluster sizes and spatial dimensions
# Ranges/areas come from each cluster's axis-aligned bounding box in km, so
# cluster_area_km2 overestimates the true footprint of non-rectangular clusters.
cluster_analysis = hdbscan_dataset_final %>%
group_by(cluster) %>%
summarise(
cluster_size = n(),
lon_range_km = max(lon_km_utm13) - min(lon_km_utm13),
lat_range_km = max(lat_km_utm13) - min(lat_km_utm13),
cluster_area_km2 = lon_range_km * lat_range_km,
centroid_lon = mean(lon_km_utm13),
centroid_lat = mean(lat_km_utm13),
.groups = 'drop'
)
# Part B: Summary Statistics
cat("\nCLUSTER SPATIAL SIZE STATISTICS:\n")
##
## CLUSTER SPATIAL SIZE STATISTICS:
cat("================================\n")
## ================================
cat("Average longitude range (km):", round(mean(cluster_analysis$lon_range_km), 2), "\n")
## Average longitude range (km): 1.85
cat("Average latitude range (km):", round(mean(cluster_analysis$lat_range_km), 2), "\n")
## Average latitude range (km): 1.53
cat("Average cluster area (km²):", round(mean(cluster_analysis$cluster_area_km2), 2), "\n")
## Average cluster area (km²): 3.61
cat("Median longitude range (km):", round(median(cluster_analysis$lon_range_km), 2), "\n")
## Median longitude range (km): 1.61
cat("Median latitude range (km):", round(median(cluster_analysis$lat_range_km), 2), "\n")
## Median latitude range (km): 1.27
cat("Median cluster area (km²):", round(median(cluster_analysis$cluster_area_km2), 2), "\n")
## Median cluster area (km²): 2.08
cat("\nCLUSTER SPATIAL DISTRIBUTION:\n")
##
## CLUSTER SPATIAL DISTRIBUTION:
cat("=============================\n")
## =============================
cat("Longitude Range (km):\n")
## Longitude Range (km):
summary(cluster_analysis$lon_range_km)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1076 1.0752 1.6083 1.8475 2.3459 11.8702
cat("\nLatitude Range (km):\n")
##
## Latitude Range (km):
summary(cluster_analysis$lat_range_km)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1611 0.9377 1.2730 1.5329 1.8362 24.3017
cat("\nCluster Area (km²):\n")
##
## Cluster Area (km²):
summary(cluster_analysis$cluster_area_km2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02342 1.05870 2.07545 3.61175 4.09384 107.63705
# Part C: Histogram Of Cluster
mean_area = mean(cluster_analysis$cluster_area_km2)
median_area = median(cluster_analysis$cluster_area_km2)
sd_area = sd(cluster_analysis$cluster_area_km2)
lower_bound = mean_area - 2 * sd_area
upper_bound = mean_area + 2 * sd_area
# Create interactive plotly histogram of cluster areas, with dashed
# mean/median reference lines and matching annotations.
#
# Fix: `inherit = FALSE` on both add_lines() calls. Without it, the line
# traces inherit the histogram-only attributes from plot_ly() (`nbinsx`,
# `marker`), which previously emitted the warnings
#   "A marker object has been specified, but markers is not in the mode"
#   "'scatter' objects don't have these attributes: 'nbinsx'"
# at render time. The lines specify their own x/y, so nothing else changes.
#
# NOTE(review): the reference lines span y = 0..100 and annotations sit at
# y = 80/90 — these are hard-coded to the expected bar-count scale; confirm
# they still cover the tallest bin if the binning changes.
cluster_plot_interactive <- plot_ly(
  data = cluster_analysis,
  x = ~cluster_area_km2,
  type = "histogram",
  nbinsx = 500,
  marker = list(color = "lightgreen", line = list(color = "black", width = 1)),
  opacity = 0.7,
  hovertemplate = "Area: %{x:.3f} km²<br>Count: %{y}<extra></extra>"
) %>%
  add_lines(x = c(mean_area, mean_area), y = c(0, 100),
            line = list(color = "red", dash = "dash", width = 2),
            showlegend = FALSE, hoverinfo = "none", inherit = FALSE) %>%
  add_lines(x = c(median_area, median_area), y = c(0, 100),
            line = list(color = "blue", dash = "dash", width = 2),
            showlegend = FALSE, hoverinfo = "none", inherit = FALSE) %>%
  add_annotations(
    x = mean_area, y = 90,
    text = paste("Mean =", round(mean_area, 2), "km²"),
    showarrow = FALSE,
    font = list(color = "red"),
    xanchor = "left"
  ) %>%
  add_annotations(
    x = median_area, y = 80,
    text = paste("Median =", round(median_area, 2), "km²"),
    showarrow = FALSE,
    font = list(color = "blue"),
    xanchor = "left"
  ) %>%
  layout(
    title = "Distribution of Cluster Areas (Initial view: 2 Standard Deviations)",
    xaxis = list(
      title = "Cluster Area (km²)",
      range = c(lower_bound, upper_bound)
    ),
    yaxis = list(title = "Frequency (Number of Clusters)"),
    hovermode = "x"
  )
cluster_plot_interactive
Check the number of missing values in the selected features:
# Per-feature NA audit for the candidate model variables in the cleaned
# HDBSCAN dataset; prints count and percentage of missing values each.
variables_to_check <- c("temp_bb", "rh", "area_bb", "cloud_mask", "lon_km_utm13", "lat_km_utm13", "julian_date")
total_non_na <- nrow(hdbscan_dataset_non_na)
for (feature in variables_to_check) {
  # [[ ]] extraction handles the dynamic column name safely (no $ partial match)
  n_missing <- sum(is.na(hdbscan_dataset_non_na[[feature]]))
  pct_missing <- round(100 * n_missing / total_non_na, 2)
  print(paste(feature, "missing:", n_missing, "(", pct_missing, "%)"))
}
## [1] "temp_bb missing: 0 ( 0 %)"
## [1] "rh missing: 0 ( 0 %)"
## [1] "area_bb missing: 0 ( 0 %)"
## [1] "cloud_mask missing: 1938 ( 8.77 %)"
## [1] "lon_km_utm13 missing: 0 ( 0 %)"
## [1] "lat_km_utm13 missing: 0 ( 0 %)"
## [1] "julian_date missing: 0 ( 0 %)"
Overlay of the observed, predicted, and imputed methane_eq histograms:
# Overlaid histograms of observed vs model-predicted vs imputed methane_eq
# for the best-performing model. Helper is project code, presumably defined
# in src/xgboost_functions.R (sourced at the top of the file) — confirm.
# NOTE(review): the ggplot warning below reports 52577 non-finite rows
# dropped by stat_bin(); presumably NAs in one of the overlaid series —
# verify this is expected for `model_dataset`.
plot_overlapped_histograms(model_dataset, title = "Observed vs Predicted vs Imputed, Highest R2 Model")
## Warning: Removed 52577 rows containing non-finite outside the scale range
## (`stat_bin()`).